# imputing oil data using missForest


# Normalise the region / country labels of `oil_melt32`.
#
# The columns are coerced to character *before* any recoding. Recoding a
# factor in place only works when the new label is already one of its levels
# (otherwise R inserts NA with a warning), and `ifelse()` on a factor returns
# the integer codes instead of the labels.
oil_melt32$country <- as.character(oil_melt32$country)
oil_melt32$region <- as.character(oil_melt32$region)
oil_melt32$offshore <- as.character(oil_melt32$offshore)
oil_melt32$condensate <- as.character(oil_melt32$condensate)
oil_melt32$avgapigravity <- as.character(oil_melt32$avgapigravity)
oil_melt32$avgdepth <- as.character(oil_melt32$avgdepth)

# Known misspellings of region names and their canonical spelling.
region_typos <- c(
  "easter europe" = "eastern europe",
  "wester europe" = "western europe",
  "western eurpoe" = "western europe",
  "asia-pacifric" = "asia-pacific",
  "asia pacific" = "asia-pacific",
  "western hemipshere" = "western hemisphere"
)
# `%in%` never yields NA, so rows with a missing region are left untouched.
misspelled <- oil_melt32$region %in% names(region_typos)
oil_melt32$region[misspelled] <-
  unname(region_typos[oil_melt32$region[misspelled]])

# Remove the hyphen from this one country label.
oil_melt32$country[oil_melt32$country %in% "chi-taiwan"] <- "chitaiwan"

# Country-based overrides are applied last so they win over whatever region
# label the row carried before.
oil_melt32$region[oil_melt32$country %in% "poland"] <- "eastern europe"
oil_melt32$region[oil_melt32$country %in% "united kingdom"] <- "western europe"
oil_melt32$region[
  oil_melt32$country %in% c("saudi arabia", "ras al khaimah")
] <- "middle east"

# Regions to iterate over during the imputation further down.
unique.region.32 <- as.character(unique(oil_melt32$region))

# Diagnostic: print how many distinct fields (`field_new`) each region has.
plyr::ddply(
  distinct(oil_melt32, region, field_new),
  ~region,
  summarize,
  count = n()
)

# Cap on plausible output: the largest value observed for Saudi Arabia plus a
# 1% margin. Anything above the cap is set to missing.
saudi_oil <- oil_melt32$oil[oil_melt32$country %in% "saudi arabia"]

# Without at least one non-missing Saudi value, `max()` would return -Inf and
# the replacement below would blank out every `oil` value in the data.
if (length(saudi_oil) == 0 || all(is.na(saudi_oil))) {
  stop(
    "no non-missing oil values for country 'saudi arabia'; ",
    "cannot derive the outlier cap",
    call. = FALSE
  )
}
max_saudi <- max(saudi_oil, na.rm = TRUE) * 1.01

# Diagnostic: number of observations above the cap.
sum(oil_melt32$oil > max_saudi, na.rm = TRUE)

# `which()` drops NA comparisons, so already-missing values are left alone.
oil_melt32$oil[which(oil_melt32$oil > max_saudi)] <- NA

# Field identifier: field name and region joined by ", ".
oil_melt32[["id"]] <- paste0(
  oil_melt32[["field_new"]], ", ", oil_melt32[["region"]]
)

# Column types: region as character, the two flags as factors, the two
# measurements as numeric.
oil_melt32[["region"]] <- as.character(oil_melt32[["region"]])
oil_melt32[["offshore"]] <- as.factor(as.character(oil_melt32[["offshore"]]))
oil_melt32[["condensate"]] <- as.factor(
  as.character(oil_melt32[["condensate"]])
)
oil_melt32[["avgapigravity"]] <- as.numeric(oil_melt32[["avgapigravity"]])
oil_melt32[["avgdepth"]] <- as.numeric(oil_melt32[["avgdepth"]])

# Discovery dates before 1700 are treated as missing.
oil_melt32[["discdate"]][which(oil_melt32[["discdate"]] < 1700)] <- NA

# Flag (1/0) observations with positive output in a year that precedes the
# recorded discovery date.
oil_melt32[["discdate_wrong"]] <- with(
  oil_melt32,
  ifelse(discdate > years & oil > 0, 1, 0)
)


# For every field id that has positive output before its recorded discovery
# date, find the earliest such year.
temp_year <- oil_melt32 %>%
  filter(
    oil > 0,
    discdate_wrong == 1
  ) %>%
  plyr::ddply(
    ~id,
    summarize,
    min_year = min(years, na.rm = TRUE)
  )

# Move the discovery date of those fields back to that earliest year.
# The guard keeps the step a no-op when nothing was flagged; iterating over
# `1:nrow(temp_year)` would run for i = 1 and i = 0 on an empty table and fail.
if (nrow(temp_year) > 0) {
  fix_row <- match(oil_melt32$id, temp_year$id)
  needs_fix <- !is.na(fix_row)
  oil_melt32$discdate[needs_fix] <- temp_year$min_year[fix_row[needs_fix]]
}

# The flag is only a helper for the correction above.
oil_melt32 <- dplyr::select(oil_melt32, -discdate_wrong)

# Years elapsed since discovery at each observation.
oil_melt32$field_age <- oil_melt32$years - oil_melt32$discdate

# Keep observations at or after discovery. Rows with a missing `field_age` are
# dropped as well, because `filter()` discards NA conditions.
oil_melt32 <- filter(
  oil_melt32,
  field_age >= 0
)

# Diagnostics: number and share of missing cells among the listed columns.
# The share is taken over the cells actually selected, so it stays correct
# even if one of the listed columns is absent from the data.
na_check_cols <- c(
  "region", "offshore", "condensate", "avgapigravity", "avgdepth",
  "years", "oil", "field_age", "field_new"
)
na_mask <- is.na(
  oil_melt32[, colnames(oil_melt32) %in% na_check_cols, drop = FALSE]
)
sum(na_mask)
mean(na_mask)
# Impute missing values separately for each region with missForest and save
# the results, one pair of .Rda files per region.
#
# Output goes to the relative directory "imputed_values", the same location
# the combining step reads from. (The previous absolute "/imputed_values/"
# pointed at the filesystem root, so the results were never found.)
output_dir <- "imputed_values"
dir.create(output_dir, showWarnings = FALSE)

for (z in seq_along(unique.region.32)) {
  current_region <- unique.region.32[z]

  # `region == NA` matches no rows, so a missing region cannot be imputed.
  if (is.na(current_region)) {
    warning(
      "rows with a missing region are skipped by the imputation",
      call. = FALSE
    )
    next
  }

  # Same seed for every region so each region's result is reproducible on
  # its own.
  set.seed(1)

  data_to_impute <-
    filter(
      oil_melt32,
      region == current_region
    ) %>%
    mutate(
      # Replace dots and hyphens in the old field name with spaces.
      field_old = gsub("\\.", " ", field_old),
      field_old = gsub("-", " ", field_old)
    ) %>%
    as.data.frame(.)

  # Identifying columns, set aside here and re-attached after imputation.
  id.vars <-
    dplyr::select(
      data_to_impute,
      c(
        region, country, field_old, field_new, identifier, var1
      )
    )

  # One 0/1 factor column per field, named after the field. Fields literally
  # called "offshore" or "condensate" get a suffixed column name so they do
  # not clash with the covariates of the same name.
  unique.fields.temp <- unique(data_to_impute$field_new)
  for (i in seq_along(unique.fields.temp)) {
    field_name <- unique.fields.temp[i]
    data_to_impute$temp_var <- factor(
      ifelse(data_to_impute$field_new == field_name, "1", "0"),
      levels = c("0", "1")
    )
    if (field_name == "offshore") {
      dummy_name <- "offshore_name"
    } else if (field_name == "condensate") {
      dummy_name <- "condensate_name"
    } else {
      dummy_name <- field_name
    }
    colnames(data_to_impute)[colnames(data_to_impute) == "temp_var"] <-
      dummy_name
  }

  # Drop identifiers and other columns that are not passed to missForest.
  data_to_impute <-
    dplyr::select(
      data_to_impute,
      -c(
        region, country, field_old, id, discdate, field_new, identifier, var1,
        unique_id
      )
    )

  data_imputed <-
    missForest(
      data_to_impute,
      verbose = FALSE,
      variablewise = TRUE
    )

  # Keep the first seven imputed columns next to the identifiers.
  # NOTE(review): presumably these are the covariates preceding the per-field
  # dummies - confirm against the column order of `oil_melt32`.
  imputed_values <- data.frame(bind_cols(id.vars, data_imputed$ximp[, 1:7]))

  save(
    imputed_values,
    file = file.path(output_dir, paste0(current_region, ".Rda"))
  )
  save(
    data_imputed,
    file = file.path(output_dir, paste0("data_imputed_", current_region, ".Rda"))
  )
  cat(paste0("\rfinished ", current_region, "."))
}


# Combine the per-region imputed tables into one data set and save it.
output_dir <- "imputed_values"
combined_file <- "oil_data_32_imputed.Rda"

all_files_oil_32 <- list.files(output_dir)
# Skip the full missForest result objects ("data_imputed_<region>.Rda") ...
all_files_oil_32 <- all_files_oil_32[!grepl("data_imputed", all_files_oil_32)]
# ... and the combined file written at the end of this block. It lives in the
# same directory, so a re-run would otherwise bind the previous result back in
# and duplicate every row.
all_files_oil_32 <- setdiff(all_files_oil_32, combined_file)

if (length(all_files_oil_32) == 0) {
  stop(
    "no per-region imputation files found in '", output_dir, "'",
    call. = FALSE
  )
}

# Load each file into a scratch environment and pull out the object it holds,
# so nothing in the global environment is overwritten.
region_tables <- lapply(all_files_oil_32, function(file_name) {
  scratch <- new.env()
  loaded <- load(file.path(output_dir, file_name), envir = scratch)
  get(loaded[1], envir = scratch)
})

# Bind once instead of growing a data frame inside a loop. `rev()` keeps the
# row order of the earlier prepend-in-a-loop version (last file first).
all_data <- bind_rows(rev(region_tables))

oil_data_32_imputed <- all_data
save(oil_data_32_imputed, file = file.path(output_dir, combined_file))
